This vignette introduces scBubbletree, a transparent methodology for single cell RNA-seq data exploration based on well established methods clustering and visualization. We will demonstrate the functionality of scBubbletree by analyzing a large dataset of human PBMCs (Case studies B), while also showing how to integrate scBubbletree with existing pipelines for scRNA-seq analysis e.g. based on Seurat.
To run this vignette we need several R-packages. Load them now:
source(file = "../Rutil/Graphics.R")
source(file = "../scBubbletree/R/util.R")
source(file = "../scBubbletree/R/main.R")
source(file = "../scBubbletree/R/annotation.R")
library(cluster, "/usr/local/lib/R/site-library")
library(Seurat, lib.loc = lib.loc)
library(ggplot2, lib.loc = lib.loc)
library(reshape2, lib.loc = lib.loc)
library(parallel, lib.loc = lib.loc)
library(ape, "/usr/local/lib/R/site-library")
library(ggtree, lib.loc = lib.loc)
library(org.Hs.eg.db, lib.loc = lib.loc)
library(bluster, lib.loc = lib.loc)
library(SummarizedExperiment, lib.loc = lib.loc)
library(ggrepel, lib.loc = lib.loc)
In this case study we will analyze scRNA-seq dataset of approximately 161,000 PBMCs derived from 8 healthy donors, collected at three timepoints and sequenced by 10x Genomics protocol. This is a large and complex scRNA-seq dataset, perfect for demonstrating the advantageS of scBubbletree over qualitative analyses e.g. based on 2D UMAPs.
In addition to gene expression data, this dataset reports for each cell a cell type prediction at three levels of resolution (l1, l2 and l3). The cell types have been predicted based on marker genes, and we will use them as categorical cell annotations. As this dataset is part of a larger CITE-seq experiment, the cell type predictions are made based on scRNA-seq derived gene expressions and also on the cell-surface protein expression levels of up to 228 marker proteins.
In essence, we will perform the same steps as in Case study A:
k \(\rightarrow\) with \(f:\) get_kget_bubbletreeget_cat_feature_tiles,get_num_feature_tiles,get_num_feature_violinsupdate_bubbletree, get_gini, get_gini_kk and go back to 2.The raw gene expressions have already been preprocessed with Seurat. SCT transformation was used for normalization, followed by principal component analysis for dimensionality reduction. 15 lower-space features have been extracted and used to generate a 2D UMAP.
Lets load the data.
# To get the data used in this case study do the following steps:
# 1. download reference data from vignette:
# https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat
# 2. load SeuratDistk
# library(SeuratDisk, lib.loc = lib.loc)
# d <- LoadH5Seurat("case_study_B/pbmc_multimodal.h5seurat")
# save(d, file = "raw_data/Hao_2021/Hao_2021.RData")
# d <- get(load(file = "raw_data/Hao_2021/Hao_2021.RData"))
# d[["mt"]] <- PercentageFeatureSet(d, pattern = "^MT-")
# d@reductions$apca <- NULL
# d@reductions$aumap <- NULL
# d@reductions$spca <- NULL
# d@reductions$wnn.umap <- NULL
# d@reductions$umap <- NULL
Lets look at the number of cells per donor at a given sampling time:
table(d@meta.data$time, d@meta.data$donor)
FALSE
FALSE P1 P2 P3 P4 P5 P6 P7 P8
FALSE 0 6443 5978 4698 5307 7020 6093 8909 8916
FALSE 3 5917 5714 5002 5793 6933 7583 8200 9203
FALSE 7 5775 5513 4960 5990 7858 7066 8762 8131
Also, we can show the number of cells per donor and predicted cell type at the lowest resolution (l1):
table(d@meta.data$celltype.l1, d@meta.data$donor)
FALSE
FALSE P1 P2 P3 P4 P5 P6 P7 P8
FALSE B 715 1335 925 1296 1798 3229 3472 1030
FALSE CD4 T 6500 5717 4721 5103 2925 4457 5429 6149
FALSE CD8 T 3233 3497 1623 3049 3605 1503 3232 5727
FALSE DC 334 276 250 333 668 379 904 445
FALSE Mono 5036 3500 4088 5188 8022 6627 9196 7353
FALSE NK 1590 1610 2423 688 2760 3357 2303 3933
FALSE other 491 210 180 154 952 506 576 373
FALSE other T 236 1060 450 1279 1081 684 759 1240
How many distinct cell types are there at each resolution?
length(unique(d@meta.data$celltype.l1))
FALSE [1] 8
length(unique(d@meta.data$celltype.l2))
FALSE [1] 31
length(unique(d@meta.data$celltype.l3))
FALSE [1] 58
Next we will show the 2D UMAP. Cells are color coded according to their cell type predictions resolution level l1 (left UMAP) and l2 (right UMAP).
umap_data <- cbind(d@meta.data, d@reductions$umap@cell.embeddings)
umap_data$closest_cluster <- NA
umap_centers <- merge(x = aggregate(UMAP_1~celltype.l1, data = umap_data, FUN = median),
y = aggregate(UMAP_2~celltype.l1, data = umap_data, FUN = median),
by = "celltype.l1")
g_umap <- ggplot()+
geom_point(data = umap_data,
aes(x = UMAP_1, y = UMAP_2, col = celltype.l1), size = 0.25)+
geom_text_repel(data = umap_centers,
aes(x = UMAP_1, y = UMAP_2, label = celltype.l1),
min.segment.length = 0, size = 2)+
theme(legend.position = "none")+
guides(colour = guide_legend(nrow = 4,
override.aes = list(size=2)))
g_umap
umap_data <- cbind(d@meta.data, d@reductions$umap@cell.embeddings)
umap_data$closest_cluster <- NA
umap_centers <- merge(x = aggregate(UMAP_1~celltype.l2, data = umap_data, FUN = median),
y = aggregate(UMAP_2~celltype.l2, data = umap_data, FUN = median),
by = "celltype.l2")
g_umap <- ggplot()+
geom_point(data = umap_data,
aes(x = UMAP_1, y = UMAP_2, col = celltype.l2), size = 0.25)+
geom_text_repel(data = umap_centers,
aes(x = UMAP_1, y = UMAP_2, label = celltype.l2),
min.segment.length = 0, size = 2)+
theme(legend.position = "none")+
guides(colour = guide_legend(nrow = 4,
override.aes = list(size=2)))
g_umap
tsne_data <- cbind(d@meta.data, d@reductions$tsne@cell.embeddings)
tsne_data$closest_cluster <- NA
tsne_centers <- merge(x = aggregate(tSNE_1~celltype.l2, data = tsne_data, FUN = median),
y = aggregate(tSNE_2~celltype.l2, data = tsne_data, FUN = median),
by = "celltype.l2")
g_tsne <- ggplot()+
geom_point(data = tsne_data,
aes(x = tSNE_1, y = tSNE_2, col = celltype.l2), size = 0.25)+
geom_text_repel(data = tsne_centers,
aes(x = tSNE_1, y = tSNE_2, label = celltype.l2),
min.segment.length = 0, size = 2)+
theme(legend.position = "none")+
guides(colour = guide_legend(nrow = 4,
override.aes = list(size=2)))
g_tsne
tsne_data <- cbind(d@meta.data, d@reductions$tsne@cell.embeddings)
tsne_data$closest_cluster <- NA
tsne_centers <- merge(x = aggregate(tSNE_1~celltype.l2, data = tsne_data, FUN = median),
y = aggregate(tSNE_2~celltype.l2, data = tsne_data, FUN = median),
by = "celltype.l2")
g_umap <- ggplot()+
geom_point(data = tsne_data,
aes(x = tSNE_1, y = tSNE_2, col = celltype.l2), size = 0.25)+
geom_text_repel(data = tsne_centers,
aes(x = tSNE_1, y = tSNE_2, label = celltype.l2),
min.segment.length = 0, size = 2)+
theme(legend.position = "none")+
guides(colour = guide_legend(nrow = 4,
override.aes = list(size=2)))
g_umap
Now that the UMAP contains more than 161,000 cells from a heterogeneous tissue, we start to see the challenges associated with this approach:
massive overplotting
lack of compositional information
qualitative analysis
Now lets turn to scBubbletree. As earlier, our first goal is to determine k for the clustering of matrix \(A^{n \times f}\), which represents the low- dimensional feature space of the normalized gene expression matrix. We will use the first \(f\)=15 PCA dimensions as features. We selected \(f\)=15 based on the elbow plot below.
# This is the main input of scBubbletree
# -> matrix A with n=cells, f=features (from PCA)
A <- d@reductions$pca@cell.embeddings[, 1:15]
# meta data
meta <- d@meta.data
# quantitative features (gene expressions of marker genes) to be used later on:
# * GNLY, NKG7: NK cells
# * IL7R: CD4 T cells
# * CD8A: CD8 T cells
# * MS4A1: B cells
# * CD14, LYZ: CD14+ Monocytes
# * FCGR3A, MS4A7: FCGR3A+ Monocytes
# * FCER1A, CST3: Dendritic Cells
# * PPBP: Megakaryocytes
as <- as.matrix(t(d@assays$SCT@data[
rownames(d@assays$SCT@data) %in%
c("IL7R",
"CD14", "LYZ",
"MS4A1",
"CD8A",
"GNLY", "NKG7",
"FCGR3A", "MS4A7",
"FCER1A", "CST3",
"PPBP"), ]))
FALSE used (Mb) gc trigger (Mb) max used (Mb)
FALSE Ncells 7578289 404.8 12133015 648.0 12133015 648.0
FALSE Vcells 21866676 166.9 1182778123 9023.9 1292371894 9860.1
We will once again use the function get_k for clustering. Notice the modified parameter cv_prop=0.2, for faster execution of this large dataset:
if(redo_case_b) {
# Determine appropriate number of clusters (k)
b <- get_k(B = 10,
cv_prop = 0.15, # use 15% for gap stat.
ks = seq(from = 1, to = 40, by = 1),
x = A,
n_start = 100,
iter_max = 200,
kmeans_algorithm = "MacQueen",
cores = 20,
mini_output = F,
B_gap = 5)
if(dir.exists("case_study_B")==F) {
dir.create("case_study_B")
}
save(b, file = "case_study_B/b.RData")
cat("Done.")
} else {
b <- get(load("~/scBubbletree/case_study_B/b.RData"))
}
We see a more complex silhouette curve. This is normal given the complexity in our dataset, which is composed of many cell types with high and low relative frequencies. We see a maximum silhouette index at k=5. The index then drops sharply until k=11, followed by shallow increase until k=14 after the silhouette starts once again to decrease.
Elbows are not easy to detect from the Gap and WSS curves. It is quite evident, however, that both curves flatten at around k=15 to k=20.
k=18if(redo_case_b) {
k20 <- get_bubbletree(x = A,
k = 18,
seed = 4321,
n_start = 1000,
iter_max = 1000,
cores = 20,
B = 200,
N_eff = 200,
round_digits = 1,
show_simple_count = T)
save(k20, file = "case_study_B/k20.RData")
} else {
k20 <- get(load("~/scBubbletree/case_study_B/k20.RData"))
}
Summary of the bubbletree:
c1 <- meta$celltype.l1
c2 <- gsub(pattern = "Proliferating", replacement = 'prolif.', meta$celltype.l2)
c2 <- gsub(pattern = "CD56bright", replacement = 'CD56', x = c2)
donor <- meta$donor
# create tile plots
w0 <- get_cat_feature_tiles(d = k20,
a = c1, # res = 1
integrate_vertical = T,
round_digits = 0,
show_hclust = F,
tile_text_size = 2.75)
w1 <- get_cat_feature_tiles(d = k20,
a = c2, # res = 2
integrate_vertical = T,
round_digits = 0,
show_hclust = F,
tile_text_size = 2.25)
w2 <- get_cat_feature_tiles(d = k20,
a = donor,
integrate_vertical = T,
round_digits = 0,
show_hclust = F,
tile_text_size = 2.75)
c2 <- gsub(pattern = "Proliferating", replacement = 'prolif.', meta$celltype.l2)
c2 <- gsub(pattern = "CD56bright", replacement = 'CD56', x = c2)
w1 <- get_cat_feature_tiles(d = k20,
a = c2,
integrate_vertical = T,
round_digits = 0,
show_hclust = F,
tile_text_size = 2.25)
w2 <- get_cat_feature_tiles(d = k20,
a = c2,
integrate_vertical = F,
round_digits = 0,
show_hclust = F,
tile_text_size = 2.25)
Interestingly, the bubbles 1 and 2 contain many different cell types. Additional clustering of these bubbles might be necessary.
Can we show quantitatively that by increasing k we get “better” clustering in a semi-supervised way? Yes, we can use the Gini impurity index.
For this we will integrate the results obtained by function get_k with l1, l2 and l3 cell type predictions (labels), and show quantitatively the change in Gini index as a function of k. This is done with the function get_gini_k. One potential caveat of this approach is that some of the predictions might be inaccurate.
Lets invoke the function get_gini_k and pass the object b obtained earlier together with the cell type predictions:
b <- get(load("~/scBubbletree/case_study_B/b.RData"))
gini_l1 <- get_gini_k(labels = meta$celltype.l1,
get_k_obj = b)
gini_l2 <- get_gini_k(labels = meta$celltype.l2,
get_k_obj = b)
gini_l3 <- get_gini_k(labels = meta$celltype.l3,
get_k_obj = b)
We next plot the Gini scores for the labels at resolution l1, l2 and l3.
Summary of the above plot:
k approaches 20.Gene expression annotations can also be integrated to better explain the bubbles and the tree structure. Lets visualize the mean gene expression of some marker genes in the bubbles:
# This function build the nummeric annotations plot
w3 <- get_num_feature_tiles(d = k20,
as = as,
plot_title = "",
round_digits = 1,
tile_text_size = 2.5)
# Plot
(k20$tree|w3$w)+patchwork::plot_layout(widths = c(1, 1))
Second, we can visualize the distribution of each marker gene in each bubble using violin plots with get_num_feature_violins. This function uses the same input as get_num_feature_tiles.
Lets invoke this function now.
w4 <- get_num_feature_violins(d = k20,
as = as,
plot_title = "",
scales = 'free_x',
show_cells = F)
((k20$tree|w3$w)+patchwork::plot_layout(widths = c(1, 1)))/w4$w
# lets first create a variable showing pairs (donor x timepoint)
a <- paste0(meta$donor, '_', meta$time)
a <- factor(x = a, levels = sort(unique(a)))
# categorical donor meta data
w5 <- get_cat_feature_tiles(d = k20,
a = a,
integrate_vertical = T,
round_digits = 0,
show_hclust = F,
disable_hclust = T,
tile_text_size = 2.5)
w5$w
local = yes; long-range = no
g_pairs_umap <- ggplot(data = pair_dist)+
geom_point(aes(y = a_euc, x = u_euc, col = comparison), size = 0.25)+
geom_density_2d(aes(y = a_euc, x = u_euc), col = "orange")+
ylab(label = "Distance in PCA")+
xlab(label = "Distance in 2D UMAP")+
scale_color_manual(values = c("black", "darkgray"))+
theme(legend.position = "none")
g_pairs_umap
g_pairs_tsne <- ggplot(data = pair_dist)+
geom_point(aes(y = a_euc, x = t_euc, col = comparison), size = 0.25)+
geom_density_2d(aes(y = a_euc, x = t_euc), col = "orange")+
ylab(label = "Distance in PCA")+
xlab(label = "Distance in 2D t-SNE")+
scale_color_manual(values = c("black", "darkgray"))+
theme(legend.position = "none")
g_pairs_tsne
Hao, Yuhan, et al. “Integrated analysis of multimodal single-cell data.” Cell 184.13 (2021): 3573-3587.↩︎
https://satijalab.org/seurat/articles/multimodal_reference_mapping.html↩︎